import numpy as np
import pandas as pd
import pickle as pkl
from functions import *

#######################################################
#Preprocessing
#######################################################

# Load the incumbent voting data, keep only the two name columns, discard rows
# where either name is missing (NaN), and collapse exact duplicate rows.
congress = (
    pd.read_csv('incumbent.csv')[['rep_current', 'voted_rep_chosen']]
    .dropna(subset=['rep_current', 'voted_rep_chosen'])
    .drop_duplicates()
)

# A row whose two names are identical is not a candidate pair, so drop it.
congress = congress[congress['rep_current'] != congress['voted_rep_chosen']]

# Getting rid of pairs which are duplicates switched around.
# Each pair is stored as a frozenset so that (a, b) and (b, a) compare equal,
# and the pairs already seen are tracked in a set so every membership test is
# O(1) instead of a linear scan over a growing list. The first occurrence of
# each unordered pair is kept, in its original order.
seen_pairs = set()
new_current_reps, new_voted_reps = [], []

for cur_rep, vot_rep in zip(congress['rep_current'], congress['voted_rep_chosen']):
    pair = frozenset((cur_rep, vot_rep))
    if pair in seen_pairs:
        continue
    seen_pairs.add(pair)
    new_current_reps.append(cur_rep)
    new_voted_reps.append(vot_rep)

# Computing string distances for the de-duplicated pairs; the result replaces
# the raw `congress` table from here on.
congress = precompute_distances(new_current_reps, new_voted_reps, combination_type='standard')

#congress.to_csv('Congress_Pairs.csv', index=False)

# Loading variables
num_matches = 500  # how many of the top-ranked pairs are kept for hand labeling
#congress = pd.read_csv('Congress_Pairs.csv')

# Open the model file in a context manager so the handle is closed as soon as
# the model has been read, rather than being left open for the rest of the run.
# NOTE: unpickling can execute arbitrary code; only load model files that come
# from a trusted source.
with open("iter0_model.p", "rb") as model_file:
    basic_model = pkl.load(model_file)

#######################################################
#Applying Model to String Pairs and Hand Labeling
#######################################################

# Use basic_model to obtain a ranking of the string pairs, then renumber the
# rows from 0 so that row position follows the order returned.
ranked_matches = get_predictions(basic_model, congress, 'AFSM_score').reset_index(drop=True)

# best_matches holds the first num_matches rows of the ranking (the pairs the
# basic model scored highest); these are the pairs used for the HITL step.
best_matches = ranked_matches.head(num_matches)

print('Human Labeling Step...')

# Asking a human about the top pairs. The code below is currently disabled by
# being wrapped in a string literal.
"""
match_pairs = np.array(best_matches[['amicus', 'bonica']])
ground_truth = ask_about_matches(match_pairs)
best_matches['label'] = ground_truth
best_matches.to_csv('Survey_Final_Results.csv',index=False)
"""

